EDA¶
In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor": "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
In [3]:
# importing raw data
df_eda = pd.read_csv("eda.csv")
In [4]:
df_eda.shape
Out[4]:
(21597, 22)
In [5]:
df_eda.head(7)
Out[5]:
| sale_id | date | price | id | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2014-10-13 | 221900.000 | 7129300520 | 3.000 | 1.000 | 1180.000 | 5650.000 | 1.000 | NaN | ... | 7 | 1180.000 | 0.000 | 1955 | 0.000 | 98178 | 47.511 | -122.257 | 1340.000 | 5650.000 |
| 1 | 2 | 2014-12-09 | 538000.000 | 6414100192 | 3.000 | 2.250 | 2570.000 | 7242.000 | 2.000 | 0.000 | ... | 7 | 2170.000 | 400.000 | 1951 | 19910.000 | 98125 | 47.721 | -122.319 | 1690.000 | 7639.000 |
| 2 | 3 | 2015-02-25 | 180000.000 | 5631500400 | 2.000 | 1.000 | 770.000 | 10000.000 | 1.000 | 0.000 | ... | 6 | 770.000 | 0.000 | 1933 | NaN | 98028 | 47.738 | -122.233 | 2720.000 | 8062.000 |
| 3 | 4 | 2014-12-09 | 604000.000 | 2487200875 | 4.000 | 3.000 | 1960.000 | 5000.000 | 1.000 | 0.000 | ... | 7 | 1050.000 | 910.000 | 1965 | 0.000 | 98136 | 47.521 | -122.393 | 1360.000 | 5000.000 |
| 4 | 5 | 2015-02-18 | 510000.000 | 1954400510 | 3.000 | 2.000 | 1680.000 | 8080.000 | 1.000 | 0.000 | ... | 8 | 1680.000 | 0.000 | 1987 | 0.000 | 98074 | 47.617 | -122.045 | 1800.000 | 7503.000 |
| 5 | 6 | 2014-05-12 | 1230000.000 | 7237550310 | 4.000 | 4.500 | 5420.000 | 101930.000 | 1.000 | 0.000 | ... | 11 | 3890.000 | 1530.000 | 2001 | 0.000 | 98053 | 47.656 | -122.005 | 4760.000 | 101930.000 |
| 6 | 7 | 2014-06-27 | 257500.000 | 1321400060 | 3.000 | 2.250 | 1715.000 | 6819.000 | 2.000 | 0.000 | ... | 7 | 1715.000 | NaN | 1995 | 0.000 | 98003 | 47.310 | -122.327 | 2238.000 | 6819.000 |
7 rows × 22 columns
In [6]:
df_eda.describe()
Out[6]:
| sale_id | price | id | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 19206.000 | 21534.000 | ... | 21597.000 | 21597.000 | 21145.000 | 21597.000 | 17749.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 | 21597.000 |
| mean | 10799.000 | 540296.574 | 4580474287.771 | 3.373 | 2.116 | 2080.322 | 15099.409 | 1.494 | 0.008 | 0.234 | ... | 7.658 | 1788.597 | 291.857 | 1971.000 | 836.651 | 98077.952 | 47.560 | -122.214 | 1986.620 | 12758.284 |
| std | 6234.661 | 367368.140 | 2876735715.748 | 0.926 | 0.769 | 918.106 | 41412.637 | 0.540 | 0.087 | 0.766 | ... | 1.173 | 827.760 | 442.491 | 29.375 | 4000.111 | 53.513 | 0.139 | 0.141 | 685.230 | 27274.442 |
| min | 1.000 | 78000.000 | 1000102.000 | 1.000 | 0.500 | 370.000 | 520.000 | 1.000 | 0.000 | 0.000 | ... | 3.000 | 370.000 | 0.000 | 1900.000 | 0.000 | 98001.000 | 47.156 | -122.519 | 399.000 | 651.000 |
| 25% | 5400.000 | 322000.000 | 2123049175.000 | 3.000 | 1.750 | 1430.000 | 5040.000 | 1.000 | 0.000 | 0.000 | ... | 7.000 | 1190.000 | 0.000 | 1951.000 | 0.000 | 98033.000 | 47.471 | -122.328 | 1490.000 | 5100.000 |
| 50% | 10799.000 | 450000.000 | 3904930410.000 | 3.000 | 2.250 | 1910.000 | 7618.000 | 1.500 | 0.000 | 0.000 | ... | 7.000 | 1560.000 | 0.000 | 1975.000 | 0.000 | 98065.000 | 47.572 | -122.231 | 1840.000 | 7620.000 |
| 75% | 16198.000 | 645000.000 | 7308900490.000 | 4.000 | 2.500 | 2550.000 | 10685.000 | 2.000 | 0.000 | 0.000 | ... | 8.000 | 2210.000 | 560.000 | 1997.000 | 0.000 | 98118.000 | 47.678 | -122.125 | 2360.000 | 10083.000 |
| max | 21597.000 | 7700000.000 | 9900000190.000 | 33.000 | 8.000 | 13540.000 | 1651359.000 | 3.500 | 1.000 | 4.000 | ... | 13.000 | 9410.000 | 4820.000 | 2015.000 | 20150.000 | 98199.000 | 47.778 | -121.315 | 6210.000 | 871200.000 |
8 rows × 21 columns
In [7]:
df_eda.tail(7)
Out[7]:
| sale_id | date | price | id | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21590 | 21591 | 2015-03-26 | 1010000.000 | 7936000429 | 4.000 | 3.500 | 3510.000 | 7200.000 | 2.000 | 0.000 | ... | 9 | 2600.000 | 910.000 | 2009 | 0.000 | 98136 | 47.554 | -122.398 | 2050.000 | 6200.000 |
| 21591 | 21592 | 2015-02-19 | 475000.000 | 2997800021 | 3.000 | 2.500 | 1310.000 | 1294.000 | 2.000 | 0.000 | ... | 8 | 1180.000 | 130.000 | 2008 | 0.000 | 98116 | 47.577 | -122.409 | 1330.000 | 1265.000 |
| 21592 | 21593 | 2014-05-21 | 360000.000 | 263000018 | 3.000 | 2.500 | 1530.000 | 1131.000 | 3.000 | 0.000 | ... | 8 | 1530.000 | 0.000 | 2009 | 0.000 | 98103 | 47.699 | -122.346 | 1530.000 | 1509.000 |
| 21593 | 21594 | 2015-02-23 | 400000.000 | 6600060120 | 4.000 | 2.500 | 2310.000 | 5813.000 | 2.000 | 0.000 | ... | 8 | 2310.000 | 0.000 | 2014 | 0.000 | 98146 | 47.511 | -122.362 | 1830.000 | 7200.000 |
| 21594 | 21595 | 2014-06-23 | 402101.000 | 1523300141 | 2.000 | 0.750 | 1020.000 | 1350.000 | 2.000 | 0.000 | ... | 7 | 1020.000 | 0.000 | 2009 | 0.000 | 98144 | 47.594 | -122.299 | 1020.000 | 2007.000 |
| 21595 | 21596 | 2015-01-16 | 400000.000 | 291310100 | 3.000 | 2.500 | 1600.000 | 2388.000 | 2.000 | NaN | ... | 8 | 1600.000 | 0.000 | 2004 | 0.000 | 98027 | 47.535 | -122.069 | 1410.000 | 1287.000 |
| 21596 | 21597 | 2014-10-15 | 325000.000 | 1523300157 | 2.000 | 0.750 | 1020.000 | 1076.000 | 2.000 | 0.000 | ... | 7 | 1020.000 | 0.000 | 2008 | 0.000 | 98144 | 47.594 | -122.299 | 1020.000 | 1357.000 |
7 rows × 22 columns
In [8]:
df_eda.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21597 entries, 0 to 21596 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sale_id 21597 non-null int64 1 date 21597 non-null object 2 price 21597 non-null float64 3 id 21597 non-null int64 4 bedrooms 21597 non-null float64 5 bathrooms 21597 non-null float64 6 sqft_living 21597 non-null float64 7 sqft_lot 21597 non-null float64 8 floors 21597 non-null float64 9 waterfront 19206 non-null float64 10 view 21534 non-null float64 11 condition 21597 non-null int64 12 grade 21597 non-null int64 13 sqft_above 21597 non-null float64 14 sqft_basement 21145 non-null float64 15 yr_built 21597 non-null int64 16 yr_renovated 17749 non-null float64 17 zipcode 21597 non-null int64 18 lat 21597 non-null float64 19 long 21597 non-null float64 20 sqft_living15 21597 non-null float64 21 sqft_lot15 21597 non-null float64 dtypes: float64(15), int64(6), object(1) memory usage: 3.6+ MB
In [9]:
df_eda.duplicated().value_counts()
Out[9]:
False 21597 Name: count, dtype: int64
In [10]:
df_eda.dtypes
Out[10]:
sale_id int64 date object price float64 id int64 bedrooms float64 bathrooms float64 sqft_living float64 sqft_lot float64 floors float64 waterfront float64 view float64 condition int64 grade int64 sqft_above float64 sqft_basement float64 yr_built int64 yr_renovated float64 zipcode int64 lat float64 long float64 sqft_living15 float64 sqft_lot15 float64 dtype: object
In [11]:
type(df_eda['date'][0])
Out[11]:
str
In [12]:
# change "date" dtype to datetime with format %Y/%m/%d
df_eda['date'] = pd.to_datetime(df_eda['date'], format='%Y-%m-%d')
In [13]:
type(df_eda['date'][0])
Out[13]:
pandas._libs.tslibs.timestamps.Timestamp
In [14]:
# display number of distinct elements
df_eda.grade.nunique()
Out[14]:
11
In [15]:
df_eda['grade'].unique()
Out[15]:
array([ 7, 6, 8, 11, 9, 5, 10, 12, 4, 3, 13])
In [16]:
import missingno as msno
In [17]:
# display number of missing values per column
df_eda.isna().sum()
Out[17]:
sale_id 0 date 0 price 0 id 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 2391 view 63 condition 0 grade 0 sqft_above 0 sqft_basement 452 yr_built 0 yr_renovated 3848 zipcode 0 lat 0 long 0 sqft_living15 0 sqft_lot15 0 dtype: int64
In [18]:
msno.bar(df_eda)
Out[18]:
<Axes: >
In [19]:
msno.matrix(df_eda)
Out[19]:
<Axes: >
In [20]:
print(f"numbers of rows : {df_eda.shape[0]}")
print(f"missing values in waterfront : {round(df_eda.waterfront.isna().sum()/df_eda.shape[0]*100,2)} %")
print(f"missing values in sqft_basement : {round(df_eda.sqft_basement.isna().sum()/df_eda.shape[0]*100,2)} %")
print(f"missing values in yr_renovated : {round(df_eda.yr_renovated.isna().sum()/df_eda.shape[0]*100,2)} %")
print(f"missing values in view : {round(df_eda.view.isna().sum()/df_eda.shape[0]*100,2)} %")
print(f"missing values in data frame : {round(df_eda.isna().sum().sum()/(df_eda.shape[0]*df_eda.shape[1])*100,2)} %")
numbers of rows : 21597 missing values in waterfront : 11.07 % missing values in sqft_basement : 2.09 % missing values in yr_renovated : 17.82 % missing values in view : 0.29 % missing values in data frame : 1.42 %
In [21]:
df_new = df_eda
df_new.fillna(0, inplace=True)
df_new.fillna({'waterfront': 0, 'sqrt_basement': 291.857, 'view': 0, 'yr_renovated': 0}, inplace=True)
df_new
Out[21]:
| sale_id | date | price | id | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2014-10-13 | 221900.000 | 7129300520 | 3.000 | 1.000 | 1180.000 | 5650.000 | 1.000 | 0.000 | ... | 7 | 1180.000 | 0.000 | 1955 | 0.000 | 98178 | 47.511 | -122.257 | 1340.000 | 5650.000 |
| 1 | 2 | 2014-12-09 | 538000.000 | 6414100192 | 3.000 | 2.250 | 2570.000 | 7242.000 | 2.000 | 0.000 | ... | 7 | 2170.000 | 400.000 | 1951 | 19910.000 | 98125 | 47.721 | -122.319 | 1690.000 | 7639.000 |
| 2 | 3 | 2015-02-25 | 180000.000 | 5631500400 | 2.000 | 1.000 | 770.000 | 10000.000 | 1.000 | 0.000 | ... | 6 | 770.000 | 0.000 | 1933 | 0.000 | 98028 | 47.738 | -122.233 | 2720.000 | 8062.000 |
| 3 | 4 | 2014-12-09 | 604000.000 | 2487200875 | 4.000 | 3.000 | 1960.000 | 5000.000 | 1.000 | 0.000 | ... | 7 | 1050.000 | 910.000 | 1965 | 0.000 | 98136 | 47.521 | -122.393 | 1360.000 | 5000.000 |
| 4 | 5 | 2015-02-18 | 510000.000 | 1954400510 | 3.000 | 2.000 | 1680.000 | 8080.000 | 1.000 | 0.000 | ... | 8 | 1680.000 | 0.000 | 1987 | 0.000 | 98074 | 47.617 | -122.045 | 1800.000 | 7503.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21592 | 21593 | 2014-05-21 | 360000.000 | 263000018 | 3.000 | 2.500 | 1530.000 | 1131.000 | 3.000 | 0.000 | ... | 8 | 1530.000 | 0.000 | 2009 | 0.000 | 98103 | 47.699 | -122.346 | 1530.000 | 1509.000 |
| 21593 | 21594 | 2015-02-23 | 400000.000 | 6600060120 | 4.000 | 2.500 | 2310.000 | 5813.000 | 2.000 | 0.000 | ... | 8 | 2310.000 | 0.000 | 2014 | 0.000 | 98146 | 47.511 | -122.362 | 1830.000 | 7200.000 |
| 21594 | 21595 | 2014-06-23 | 402101.000 | 1523300141 | 2.000 | 0.750 | 1020.000 | 1350.000 | 2.000 | 0.000 | ... | 7 | 1020.000 | 0.000 | 2009 | 0.000 | 98144 | 47.594 | -122.299 | 1020.000 | 2007.000 |
| 21595 | 21596 | 2015-01-16 | 400000.000 | 291310100 | 3.000 | 2.500 | 1600.000 | 2388.000 | 2.000 | 0.000 | ... | 8 | 1600.000 | 0.000 | 2004 | 0.000 | 98027 | 47.535 | -122.069 | 1410.000 | 1287.000 |
| 21596 | 21597 | 2014-10-15 | 325000.000 | 1523300157 | 2.000 | 0.750 | 1020.000 | 1076.000 | 2.000 | 0.000 | ... | 7 | 1020.000 | 0.000 | 2008 | 0.000 | 98144 | 47.594 | -122.299 | 1020.000 | 1357.000 |
21597 rows × 22 columns
In [22]:
df_numeric = df_eda.select_dtypes(include='number')
print(df_numeric)
sale_id price id bedrooms bathrooms sqft_living \
0 1 221900.000 7129300520 3.000 1.000 1180.000
1 2 538000.000 6414100192 3.000 2.250 2570.000
2 3 180000.000 5631500400 2.000 1.000 770.000
3 4 604000.000 2487200875 4.000 3.000 1960.000
4 5 510000.000 1954400510 3.000 2.000 1680.000
... ... ... ... ... ... ...
21592 21593 360000.000 263000018 3.000 2.500 1530.000
21593 21594 400000.000 6600060120 4.000 2.500 2310.000
21594 21595 402101.000 1523300141 2.000 0.750 1020.000
21595 21596 400000.000 291310100 3.000 2.500 1600.000
21596 21597 325000.000 1523300157 2.000 0.750 1020.000
sqft_lot floors waterfront view ... grade sqft_above \
0 5650.000 1.000 0.000 0.000 ... 7 1180.000
1 7242.000 2.000 0.000 0.000 ... 7 2170.000
2 10000.000 1.000 0.000 0.000 ... 6 770.000
3 5000.000 1.000 0.000 0.000 ... 7 1050.000
4 8080.000 1.000 0.000 0.000 ... 8 1680.000
... ... ... ... ... ... ... ...
21592 1131.000 3.000 0.000 0.000 ... 8 1530.000
21593 5813.000 2.000 0.000 0.000 ... 8 2310.000
21594 1350.000 2.000 0.000 0.000 ... 7 1020.000
21595 2388.000 2.000 0.000 0.000 ... 8 1600.000
21596 1076.000 2.000 0.000 0.000 ... 7 1020.000
sqft_basement yr_built yr_renovated zipcode lat long \
0 0.000 1955 0.000 98178 47.511 -122.257
1 400.000 1951 19910.000 98125 47.721 -122.319
2 0.000 1933 0.000 98028 47.738 -122.233
3 910.000 1965 0.000 98136 47.521 -122.393
4 0.000 1987 0.000 98074 47.617 -122.045
... ... ... ... ... ... ...
21592 0.000 2009 0.000 98103 47.699 -122.346
21593 0.000 2014 0.000 98146 47.511 -122.362
21594 0.000 2009 0.000 98144 47.594 -122.299
21595 0.000 2004 0.000 98027 47.535 -122.069
21596 0.000 2008 0.000 98144 47.594 -122.299
sqft_living15 sqft_lot15
0 1340.000 5650.000
1 1690.000 7639.000
2 2720.000 8062.000
3 1360.000 5000.000
4 1800.000 7503.000
... ... ...
21592 1530.000 1509.000
21593 1830.000 7200.000
21594 1020.000 2007.000
21595 1410.000 1287.000
21596 1020.000 1357.000
[21597 rows x 21 columns]
In [23]:
corr = df_numeric.corr()
print(corr)
sale_id price id bedrooms bathrooms sqft_living \
sale_id 1.000 0.027 0.007 0.010 0.107 0.045
price 0.027 1.000 -0.017 0.309 0.526 0.702
id 0.007 -0.017 1.000 0.001 0.005 -0.012
bedrooms 0.010 0.309 0.001 1.000 0.515 0.578
bathrooms 0.107 0.526 0.005 0.515 1.000 0.756
sqft_living 0.045 0.702 -0.012 0.578 0.756 1.000
sqft_lot -0.027 0.090 -0.132 0.032 0.088 0.173
floors 0.180 0.257 0.019 0.178 0.503 0.354
waterfront -0.008 0.264 -0.004 -0.002 0.064 0.105
view -0.013 0.393 0.012 0.078 0.186 0.282
condition -0.096 0.036 -0.024 0.026 -0.126 -0.059
grade 0.082 0.668 0.008 0.357 0.666 0.763
sqft_above 0.072 0.605 -0.011 0.479 0.687 0.876
sqft_basement -0.040 0.321 -0.004 0.297 0.278 0.429
yr_built 0.200 0.054 0.022 0.156 0.507 0.318
yr_renovated -0.022 0.118 -0.011 0.018 0.047 0.051
zipcode -0.002 -0.053 -0.008 -0.154 -0.205 -0.200
lat 0.001 0.307 -0.002 -0.010 0.024 0.052
long 0.009 0.022 0.021 0.132 0.225 0.241
sqft_living15 0.027 0.585 -0.003 0.393 0.570 0.756
sqft_lot15 -0.030 0.083 -0.139 0.031 0.088 0.184
sqft_lot floors waterfront view ... grade sqft_above \
sale_id -0.027 0.180 -0.008 -0.013 ... 0.082 0.072
price 0.090 0.257 0.264 0.393 ... 0.668 0.605
id -0.132 0.019 -0.004 0.012 ... 0.008 -0.011
bedrooms 0.032 0.178 -0.002 0.078 ... 0.357 0.479
bathrooms 0.088 0.503 0.064 0.186 ... 0.666 0.687
sqft_living 0.173 0.354 0.105 0.282 ... 0.763 0.876
sqft_lot 1.000 -0.005 0.021 0.075 ... 0.115 0.184
floors -0.005 1.000 0.021 0.028 ... 0.459 0.524
waterfront 0.021 0.021 1.000 0.381 ... 0.083 0.072
view 0.075 0.028 0.381 1.000 ... 0.249 0.166
condition -0.009 -0.264 0.017 0.046 ... -0.147 -0.159
grade 0.115 0.459 0.083 0.249 ... 1.000 0.756
sqft_above 0.184 0.524 0.072 0.166 ... 0.756 1.000
sqft_basement 0.015 -0.242 0.083 0.271 ... 0.166 -0.051
yr_built 0.053 0.489 -0.024 -0.054 ... 0.448 0.424
yr_renovated 0.005 0.004 0.074 0.090 ... 0.016 0.021
zipcode -0.130 -0.060 0.029 0.085 ... -0.186 -0.262
lat -0.086 0.049 -0.012 0.006 ... 0.114 -0.001
long 0.230 0.126 -0.038 -0.078 ... 0.200 0.345
sqft_living15 0.145 0.280 0.084 0.279 ... 0.714 0.732
sqft_lot15 0.718 -0.011 0.031 0.073 ... 0.121 0.195
sqft_basement yr_built yr_renovated zipcode lat long \
sale_id -0.040 0.200 -0.022 -0.002 0.001 0.009
price 0.321 0.054 0.118 -0.053 0.307 0.022
id -0.004 0.022 -0.011 -0.008 -0.002 0.021
bedrooms 0.297 0.156 0.018 -0.154 -0.010 0.132
bathrooms 0.278 0.507 0.047 -0.205 0.024 0.225
sqft_living 0.429 0.318 0.051 -0.200 0.052 0.241
sqft_lot 0.015 0.053 0.005 -0.130 -0.086 0.230
floors -0.242 0.489 0.004 -0.060 0.049 0.126
waterfront 0.083 -0.024 0.074 0.029 -0.012 -0.038
view 0.271 -0.054 0.090 0.085 0.006 -0.078
condition 0.169 -0.362 -0.056 0.003 -0.015 -0.106
grade 0.166 0.448 0.016 -0.186 0.114 0.200
sqft_above -0.051 0.424 0.021 -0.262 -0.001 0.345
sqft_basement 1.000 -0.130 0.065 0.073 0.110 -0.142
yr_built -0.130 1.000 -0.203 -0.347 -0.148 0.410
yr_renovated 0.065 -0.203 1.000 0.062 0.028 -0.065
zipcode 0.073 -0.347 0.062 1.000 0.267 -0.564
lat 0.110 -0.148 0.028 0.267 1.000 -0.135
long -0.142 0.410 -0.065 -0.564 -0.135 1.000
sqft_living15 0.199 0.326 0.001 -0.279 0.049 0.336
sqft_lot15 0.016 0.071 0.004 -0.147 -0.086 0.256
sqft_living15 sqft_lot15
sale_id 0.027 -0.030
price 0.585 0.083
id -0.003 -0.139
bedrooms 0.393 0.031
bathrooms 0.570 0.088
sqft_living 0.756 0.184
sqft_lot 0.145 0.718
floors 0.280 -0.011
waterfront 0.084 0.031
view 0.279 0.073
condition -0.093 -0.003
grade 0.714 0.121
sqft_above 0.732 0.195
sqft_basement 0.199 0.016
yr_built 0.326 0.071
yr_renovated 0.001 0.004
zipcode -0.279 -0.147
lat 0.049 -0.086
long 0.336 0.256
sqft_living15 1.000 0.184
sqft_lot15 0.184 1.000
[21 rows x 21 columns]
In [24]:
import plotly.express as px
fig = px.imshow(corr,
text_auto=True,
height = 1000, width = 1000,
color_continuous_midpoint = 0.0,
range_color=[-1, 1],
title="Correlation Matrix")
fig.show()
In [25]:
df_numeric_new = df_new.select_dtypes(include='number')
print(df_numeric_new)
sale_id price id bedrooms bathrooms sqft_living \
0 1 221900.000 7129300520 3.000 1.000 1180.000
1 2 538000.000 6414100192 3.000 2.250 2570.000
2 3 180000.000 5631500400 2.000 1.000 770.000
3 4 604000.000 2487200875 4.000 3.000 1960.000
4 5 510000.000 1954400510 3.000 2.000 1680.000
... ... ... ... ... ... ...
21592 21593 360000.000 263000018 3.000 2.500 1530.000
21593 21594 400000.000 6600060120 4.000 2.500 2310.000
21594 21595 402101.000 1523300141 2.000 0.750 1020.000
21595 21596 400000.000 291310100 3.000 2.500 1600.000
21596 21597 325000.000 1523300157 2.000 0.750 1020.000
sqft_lot floors waterfront view ... grade sqft_above \
0 5650.000 1.000 0.000 0.000 ... 7 1180.000
1 7242.000 2.000 0.000 0.000 ... 7 2170.000
2 10000.000 1.000 0.000 0.000 ... 6 770.000
3 5000.000 1.000 0.000 0.000 ... 7 1050.000
4 8080.000 1.000 0.000 0.000 ... 8 1680.000
... ... ... ... ... ... ... ...
21592 1131.000 3.000 0.000 0.000 ... 8 1530.000
21593 5813.000 2.000 0.000 0.000 ... 8 2310.000
21594 1350.000 2.000 0.000 0.000 ... 7 1020.000
21595 2388.000 2.000 0.000 0.000 ... 8 1600.000
21596 1076.000 2.000 0.000 0.000 ... 7 1020.000
sqft_basement yr_built yr_renovated zipcode lat long \
0 0.000 1955 0.000 98178 47.511 -122.257
1 400.000 1951 19910.000 98125 47.721 -122.319
2 0.000 1933 0.000 98028 47.738 -122.233
3 910.000 1965 0.000 98136 47.521 -122.393
4 0.000 1987 0.000 98074 47.617 -122.045
... ... ... ... ... ... ...
21592 0.000 2009 0.000 98103 47.699 -122.346
21593 0.000 2014 0.000 98146 47.511 -122.362
21594 0.000 2009 0.000 98144 47.594 -122.299
21595 0.000 2004 0.000 98027 47.535 -122.069
21596 0.000 2008 0.000 98144 47.594 -122.299
sqft_living15 sqft_lot15
0 1340.000 5650.000
1 1690.000 7639.000
2 2720.000 8062.000
3 1360.000 5000.000
4 1800.000 7503.000
... ... ...
21592 1530.000 1509.000
21593 1830.000 7200.000
21594 1020.000 2007.000
21595 1410.000 1287.000
21596 1020.000 1357.000
[21597 rows x 21 columns]
In [26]:
corr_new = df_numeric_new.corr()
print(corr_new)
sale_id price id bedrooms bathrooms sqft_living \
sale_id 1.000 0.027 0.007 0.010 0.107 0.045
price 0.027 1.000 -0.017 0.309 0.526 0.702
id 0.007 -0.017 1.000 0.001 0.005 -0.012
bedrooms 0.010 0.309 0.001 1.000 0.515 0.578
bathrooms 0.107 0.526 0.005 0.515 1.000 0.756
sqft_living 0.045 0.702 -0.012 0.578 0.756 1.000
sqft_lot -0.027 0.090 -0.132 0.032 0.088 0.173
floors 0.180 0.257 0.019 0.178 0.503 0.354
waterfront -0.008 0.264 -0.004 -0.002 0.064 0.105
view -0.013 0.393 0.012 0.078 0.186 0.282
condition -0.096 0.036 -0.024 0.026 -0.126 -0.059
grade 0.082 0.668 0.008 0.357 0.666 0.763
sqft_above 0.072 0.605 -0.011 0.479 0.687 0.876
sqft_basement -0.040 0.321 -0.004 0.297 0.278 0.429
yr_built 0.200 0.054 0.022 0.156 0.507 0.318
yr_renovated -0.022 0.118 -0.011 0.018 0.047 0.051
zipcode -0.002 -0.053 -0.008 -0.154 -0.205 -0.200
lat 0.001 0.307 -0.002 -0.010 0.024 0.052
long 0.009 0.022 0.021 0.132 0.225 0.241
sqft_living15 0.027 0.585 -0.003 0.393 0.570 0.756
sqft_lot15 -0.030 0.083 -0.139 0.031 0.088 0.184
sqft_lot floors waterfront view ... grade sqft_above \
sale_id -0.027 0.180 -0.008 -0.013 ... 0.082 0.072
price 0.090 0.257 0.264 0.393 ... 0.668 0.605
id -0.132 0.019 -0.004 0.012 ... 0.008 -0.011
bedrooms 0.032 0.178 -0.002 0.078 ... 0.357 0.479
bathrooms 0.088 0.503 0.064 0.186 ... 0.666 0.687
sqft_living 0.173 0.354 0.105 0.282 ... 0.763 0.876
sqft_lot 1.000 -0.005 0.021 0.075 ... 0.115 0.184
floors -0.005 1.000 0.021 0.028 ... 0.459 0.524
waterfront 0.021 0.021 1.000 0.381 ... 0.083 0.072
view 0.075 0.028 0.381 1.000 ... 0.249 0.166
condition -0.009 -0.264 0.017 0.046 ... -0.147 -0.159
grade 0.115 0.459 0.083 0.249 ... 1.000 0.756
sqft_above 0.184 0.524 0.072 0.166 ... 0.756 1.000
sqft_basement 0.015 -0.242 0.083 0.271 ... 0.166 -0.051
yr_built 0.053 0.489 -0.024 -0.054 ... 0.448 0.424
yr_renovated 0.005 0.004 0.074 0.090 ... 0.016 0.021
zipcode -0.130 -0.060 0.029 0.085 ... -0.186 -0.262
lat -0.086 0.049 -0.012 0.006 ... 0.114 -0.001
long 0.230 0.126 -0.038 -0.078 ... 0.200 0.345
sqft_living15 0.145 0.280 0.084 0.279 ... 0.714 0.732
sqft_lot15 0.718 -0.011 0.031 0.073 ... 0.121 0.195
sqft_basement yr_built yr_renovated zipcode lat long \
sale_id -0.040 0.200 -0.022 -0.002 0.001 0.009
price 0.321 0.054 0.118 -0.053 0.307 0.022
id -0.004 0.022 -0.011 -0.008 -0.002 0.021
bedrooms 0.297 0.156 0.018 -0.154 -0.010 0.132
bathrooms 0.278 0.507 0.047 -0.205 0.024 0.225
sqft_living 0.429 0.318 0.051 -0.200 0.052 0.241
sqft_lot 0.015 0.053 0.005 -0.130 -0.086 0.230
floors -0.242 0.489 0.004 -0.060 0.049 0.126
waterfront 0.083 -0.024 0.074 0.029 -0.012 -0.038
view 0.271 -0.054 0.090 0.085 0.006 -0.078
condition 0.169 -0.362 -0.056 0.003 -0.015 -0.106
grade 0.166 0.448 0.016 -0.186 0.114 0.200
sqft_above -0.051 0.424 0.021 -0.262 -0.001 0.345
sqft_basement 1.000 -0.130 0.065 0.073 0.110 -0.142
yr_built -0.130 1.000 -0.203 -0.347 -0.148 0.410
yr_renovated 0.065 -0.203 1.000 0.062 0.028 -0.065
zipcode 0.073 -0.347 0.062 1.000 0.267 -0.564
lat 0.110 -0.148 0.028 0.267 1.000 -0.135
long -0.142 0.410 -0.065 -0.564 -0.135 1.000
sqft_living15 0.199 0.326 0.001 -0.279 0.049 0.336
sqft_lot15 0.016 0.071 0.004 -0.147 -0.086 0.256
sqft_living15 sqft_lot15
sale_id 0.027 -0.030
price 0.585 0.083
id -0.003 -0.139
bedrooms 0.393 0.031
bathrooms 0.570 0.088
sqft_living 0.756 0.184
sqft_lot 0.145 0.718
floors 0.280 -0.011
waterfront 0.084 0.031
view 0.279 0.073
condition -0.093 -0.003
grade 0.714 0.121
sqft_above 0.732 0.195
sqft_basement 0.199 0.016
yr_built 0.326 0.071
yr_renovated 0.001 0.004
zipcode -0.279 -0.147
lat 0.049 -0.086
long 0.336 0.256
sqft_living15 1.000 0.184
sqft_lot15 0.184 1.000
[21 rows x 21 columns]
In [27]:
import plotly.express as px
fig = px.imshow(corr_new,
text_auto=True,
height = 1000, width = 1000,
color_continuous_scale="Greens",
color_continuous_midpoint = 0.0,
range_color=[-1, 1],
title="Correlation Matrix")
fig.show()
Hypothesis 1 : Houses located closer to the city center have higher prices than those far away.¶
In [28]:
import plotly.express as px
# Make sure you are in Jupyter Notebook or JupyterLab
# This keeps the plot inline
import plotly.io as pio
pio.renderers.default = "notebook" # or "notebook_connected"
df_new['price_bin'] = pd.qcut(df_new['price'], q=6, labels=False)
fig = px.scatter_mapbox(
df_new,
lat='lat',
lon='long',
color='price_bin',
hover_data=['price', 'zipcode'],
zoom=10,
height=1000,
title='Seattle Price Map',
color_continuous_scale='Turbo'
)
fig.update_layout(mapbox_style="open-street-map") # street map style
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
In [ ]:
In [29]:
city_center_lat = 47.6062
city_center_lon = -122.3321
# Subtract Seattle city center coordinates
df_new["lat_dc"] = df_new["lat"] - city_center_lat
df_new["long_dc"] = df_new["long"] - city_center_lon # ✅ use city_center_lon
# Convert to approximate km
df_new["lat_km"] = df_new["lat_dc"] * 111 # 1° latitude ≈ 111 km
df_new["long_km"] = df_new["long_dc"] * 85 # 1° longitude ≈ 85 km in Seattle
In [30]:
import numpy as np
def haversine(lat1, lon1, lat2, lon2):
R = 6371 # Earth radius in km
dlat = np.radians(lat2 - lat1)
dlon = np.radians(lon2 - lon1)
a = np.sin(dlat/2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2)**2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
return R * c
# Apply to your dataframe
df_new["distance_from_center_km"] = df_new.apply(
lambda row: haversine(city_center_lat, city_center_lon, row["lat"], row["long"]), axis=1
)
In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
df_new["distance_bin"] = pd.cut(df_new["distance_from_center_km"], bins=6)
counts = df_new.groupby(["distance_bin", "price_bin"]).size().reset_index(name="count")
# Pivot the table so distance_bin is x, price_bin is y
heatmap_data = counts.pivot(index="price_bin", columns="distance_bin", values="count")
# Get 6 discrete colors from turbo colormap (for 6 price bins)
colors = cm.turbo(np.linspace(0, 1, len(df_new["price_bin"].unique())))
plt.figure(figsize=(8,6))
sns.barplot(x="distance_bin", y="count", hue="price_bin", data=counts, palette=colors)
plt.xlabel("Distance from Seattle City Center (km)")
plt.ylabel("Number of Properties")
plt.title("Number of Properties by Distance and Price Bin")
plt.xticks(rotation=45)
plt.legend(title="Price Bin")
plt.show()
Hypothesis 2 : Homes with larger living sizes have higher prices, independent of house size.¶
In [32]:
plt.figure(figsize=(8,6))
sns.scatterplot(
x='sqft_living',
y='price',
data=df_new,
alpha=0.4
)
plt.title("Price vs Living Space")
plt.xlabel("Living Space (sqft)")
plt.ylabel("Price ($)")
plt.tight_layout()
plt.show()
In [33]:
plt.figure(figsize=(8,6))
sns.scatterplot(
x='sqft_living15',
y='price',
data=df_new,
alpha=0.4
)
plt.title("Price vs Living Space")
plt.xlabel("Living Space (sqft)")
plt.ylabel("Price ($)")
plt.tight_layout()
plt.show()
In [34]:
plt.figure(figsize=(10,6))
sns.boxplot(
x='bathrooms',
y='price',
data=df_new,
showfliers=False
)
plt.title("Price Distribution by Number of Bathrooms")
plt.show()
In [35]:
plt.figure(figsize=(10,6))
sns.boxplot(
x='bedrooms',
y='price',
data=df_new,
showfliers=False
)
plt.title("Price Distribution by Number of Bedrooms")
plt.show()
In [36]:
import plotly.express as px
# Make sure you are in Jupyter Notebook or JupyterLab
# This keeps the plot inline
import plotly.io as pio
#pio.renderers.default = "notebook" # or "notebook_connected"
bins = [0, 6, 9, 13] # Define grade edges
labels = ['Low', 'Medium', 'High'] # Labels for bins
df_new['grade_bin'] = pd.cut(df_new['grade'], bins=bins, labels=labels)
fig = px.scatter_mapbox(
df_new,
lat='lat',
lon='long',
color='grade_bin',
hover_data=['price', 'zipcode'],
zoom=10,
height=1000,
title='Seattle Price Map',
color_continuous_scale='Plasma'
)
fig.update_layout(mapbox_style="open-street-map") # street map style
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
fig.write_html("price_vs_distance.html", include_plotlyjs="cdn")
In [37]:
import pandas as pd
# Example: df['grade'] exists
bins = [0, 6, 9, 13] # define edges
labels = ['Low', 'Medium', 'High'] # bin labels
df_new['grade_bin'] = pd.cut(df_new['grade'], bins=bins, labels=labels)
In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Example dataset
# df should have 'price' and 'grade' columns
# df = pd.read_csv('your_data.csv')
# 1️⃣ Bin the grades
bins = [0, 6, 9, 13] # Define grade edges
labels = ['Low', 'Medium', 'High'] # Labels for bins
df_new['grade_bin'] = pd.cut(df_new['grade'], bins=bins, labels=labels)
# 2️⃣ Aggregate by median price
summary = df_new.groupby('grade_bin')['price'].describe()
print(summary)
summary = df_new.groupby('grade_bin')['price'].agg(
mean_price='mean',
median_price='median',
std_price='std',
q25=lambda x: x.quantile(0.25),
q75=lambda x: x.quantile(0.75)
).reset_index()
#print(summary)
# Set Seaborn style
sns.set_style("whitegrid")
summary_melted = summary.melt(
id_vars='grade_bin',
var_name='Statistic',
value_name='Price'
)
plt.figure(figsize=(8,6))
sns.barplot(
x='grade_bin',
y='Price',
hue='Statistic',
data=summary_melted
)
plt.title("Statistics of House Price by Grade Bin")
plt.xlabel("Grade Bin")
plt.ylabel("Price ($)")
plt.legend(title="Statistic")
plt.tight_layout()
plt.show()
count mean std min 25% 50% \
grade_bin
Low 2308.000 295249.056 123537.868 78000.000 210000.000 270000.000
Medium 17654.000 505801.750 244859.084 90000.000 335000.000 454925.000
High 1635.000 1258670.451 707372.118 316000.000 812750.000 1040000.000
75% max
grade_bin
Low 358000.000 1200000.000
Medium 615000.000 3070000.000
High 1490000.000 7700000.000